package com.lzp.spider.service;

import com.lzp.spider.dao.JobDetailRepository;
import com.lzp.spider.pojo.JobDetail;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
import java.util.Random;
/**
 * WebMagic page processor that scrapes internship listings and persists each
 * listing as a {@link JobDetail} through {@link JobDetailRepository}.
 *
 * <p>Pagination is derived from the URL of the page currently being processed,
 * so {@link #process(Page)} does not modify any field of this class and
 * {@link #getURL()} always returns the seed URL.
 */
@Service
public class SpiderService implements PageProcessor {
    // The XPath expressions below are specific to this listing page, so the seed URL is hard-coded.
    // Left non-final so existing same-package code that assigns it keeps compiling;
    // this class itself never writes to it.
    static String URL="https://resume.shixiseng.com/interns?type=school&keyword=Java&page=1";

    /** Name of the query parameter (including '=') that carries the page index. */
    private static final String PAGE_PARAM = "page=";

    /** Last page index that will be crawled (inclusive). */
    private static final int MAX_PAGE = 3;

    /** Upper bound, in seconds, for the randomly chosen delay between requests. */
    private static final int MAX_SLEEP_SECONDS = 20;

    @Autowired
    private JobDetailRepository jobDetailRepository;

    // Crawl configuration: charset, delay between requests, retry count.
    // The delay is picked once when this object is created, in the range 1..20 seconds;
    // the "+ 1" guarantees it can never be 0 (i.e. no throttling at all).
    private final Site site=Site.me().
            setCharset("utf-8").
            setSleepTime((new Random().nextInt(MAX_SLEEP_SECONDS) + 1) * 1000).
            setRetryTimes(3);

    /**
     * Extracts every listing found on {@code page}, saves it, and queues the
     * next result page until {@code MAX_PAGE} has been reached.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        List<Selectable> nodes = page.getHtml().xpath("//div[@class='intern-wrap intern-item']").nodes();

        for (Selectable node : nodes) {
            jobDetailRepository.save(toJobDetail(node));
        }

        // Work from the URL of the page being processed rather than from shared state,
        // so concurrent or repeated crawls cannot corrupt each other's page counter.
        String nextUrl = nextPageUrl(page.getUrl().get());
        if (nextUrl != null) {
            page.addTargetRequest(nextUrl);
        }
    }

    /** Builds a {@link JobDetail} from one listing node using the site-specific XPath expressions. */
    private static JobDetail toJobDetail(Selectable node) {
        JobDetail jobDetail = new JobDetail();
        // Company name
        jobDetail.setCompany_name(node.xpath("//div[@class='f-r intern-detail__company']/p/a/text()").get());
        // Salary
        jobDetail.setSalary(node.xpath("//div[@class='f-l intern-detail__job']/p/span/text()").get());
        // Job title
        jobDetail.setJob_name(node.xpath("//div[@class='f-l intern-detail__job']/p/a/text()").get());
        // Company description
        jobDetail.setCompany_detail(node.xpath("//div[@class='f-r intern-detail__company']/p/span/text()").get());
        // City
        jobDetail.setCity(node.xpath("//span[@class='city ellipsis']/text()").get());
        // Application URL
        jobDetail.setUrl(node.xpath("//div[@class='f-l intern-detail__job']/p/a/@href").get());
        return jobDetail;
    }

    /**
     * Returns {@code currentUrl} with its page parameter incremented by one, or
     * {@code null} when the URL has no parsable page parameter or the next page
     * would exceed {@code MAX_PAGE}.
     */
    private static String nextPageUrl(String currentUrl) {
        int valueStart = pageValueStart(currentUrl);
        if (valueStart < 0) {
            return null;
        }
        int valueEnd = valueStart;
        while (valueEnd < currentUrl.length() && Character.isDigit(currentUrl.charAt(valueEnd))) {
            valueEnd++;
        }
        if (valueEnd == valueStart) {
            return null; // "page=" present but not followed by a number
        }

        int nextPage;
        try {
            nextPage = Integer.parseInt(currentUrl.substring(valueStart, valueEnd)) + 1;
        } catch (NumberFormatException tooLarge) {
            return null; // digit run does not fit in an int; certainly beyond MAX_PAGE
        }
        if (nextPage > MAX_PAGE) {
            return null;
        }
        // Replace only the digits so any parameters after the page index are preserved.
        return currentUrl.substring(0, valueStart) + nextPage + currentUrl.substring(valueEnd);
    }

    /**
     * Returns the index of the first character after {@code "page="} when it
     * appears as a query parameter (preceded by '?' or '&'), or -1 if absent.
     */
    private static int pageValueStart(String url) {
        if (url == null) {
            return -1;
        }
        int from = 0;
        while (true) {
            int idx = url.indexOf(PAGE_PARAM, from);
            if (idx < 0) {
                return -1;
            }
            if (idx > 0 && (url.charAt(idx - 1) == '?' || url.charAt(idx - 1) == '&')) {
                return idx + PAGE_PARAM.length();
            }
            from = idx + 1; // matched inside another name (e.g. "homepage="); keep looking
        }
    }

    /** Returns the crawl configuration used for every request. */
    @Override
    public Site getSite() {
        return site;
    }

    /** Returns the seed URL the crawl should start from. */
    public String getURL(){
        return URL;
    }
}


